* ========================================================================= *
*                                                                           *
*   TEXAS INSTRUMENTS, INC.                                                 *
*                                                                           *
*   NAME                                                                    *
*       scale_horz                                                          *
*                                                                           *
*                                                                           *
*   USAGE                                                                   *
*       This routine is C-callable and can be called as:                    *
*                                                                           *
*           void scale_horz                                                 *
*           (                                                               *
*               unsigned short *in_data,  /* Ptr to unscaled lines      */  *
*               unsigned int    in_len,   /* Pixels/line unscaled       */  *
*               short          *out_data, /* Ptr to scaled data lines   */  *
*               unsigned int    out_len,  /* Pixels/line of scaled data */  *
*               short          *hh,       /* Ptr to filter taps,            *
*                                            interleaved odd/even           *
*                                            outputs                    */  *
*               unsigned int    l_hh,     /* Length of scaling filters  */  *
*               unsigned int    n_hh,     /* Number of scaling filters  */  *
*               short          *patch     /* Ptr to decrement pattern   */  *
*           );                                                              *
*                                                                           *
*   DESCRIPTION                                                             *
*                                                                           *
*       This code can scale up or down 1 line of data, in the               *
*       ratio out_len : in_len.  e.g 1 to 3, 4:3, 5:6. The                  *
*       filters are designed outside of the loop using a                    *
*       general purpose resizing algorithm.                                 *
*                                                                           *
*           patch0 = patch + 2;                                             *
*           filter_count = n_hh;                                            *
*           ka = 0;                                                         *
*                                                                           *
*           line0_x = plane_x;                                              *
*           line0_y = plane_y;                                              *
*           ptr_hh = hh;                                                    *
*           jump = (int) patch[0]; ka = jump >> 1;                          *
*           jump = (int) patch[1]; kb = jump >> 1;                          *
*                                                                           *
*           for ( i = 0; i < n_y; i += 2)                                   *
*           {                                                               *
*               y0 = 1 << 5;                                                *
*               y1 = 1 << 5;                                                *
*               for ( j = 0; j < l_hh; j+=4)                                *
*               {                                                           *
*                   /* even outputs */                                      *
*                   for (k=0; k < 4; k++)                                   *
*                   {                                                       *
*                       h0 = *ptr_hh++;                                     *
*                       x0 = *(line0_x+ ka + k);                            *
*                       y0 += ( x0 * h0 );                                  *
*                   }                                                       *
*                   jump = (int) (*patch0++);                               *
*                   ka = ka + (jump>>1);                                    *
*                   /* odd outputs */                                       *
*                   for (k=0; k < 4; k++)                                   *
*                   {                                                       *
*                       h1 = *ptr_hh++;                                     *
*                       x1 = *(line0_x + kb + k);                           *
*                       y1 += ( x1 * h1 );                                  *
*                   }                                                       *
*                   jump = (int) (*patch0++);                               *
*                   kb = kb + (jump>>1);                                    *
*               }                                                           *
*               *line0_y++ = (short) (y0 >> 6) ;                            *
*               *line0_y++ = (short) (y1 >> 6) ;                            *
*                                                                           *
*               filter_count -= 2;                                          *
*               if (!filter_count)                                          *
*               {                                                           *
*                   patch0 = patch + 2;                                     *
*                   ptr_hh = hh;                                            *
*                   filter_count = n_hh;                                    *
*               }                                                           *
*           }                                                               *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       One line of data is produced per function call.                     *
*                                                                           *
*       The line must be aligned on a double word boundary and its length   *
*       must be a multiple of 8 bytes.                                      *
*                                                                           *
*       Filters are multiples of 4 taps, maximum number of filters is 256.  *
*                                                                           *
*       The computations for each output are interleaved, thus the filters  *
*       are interleaved on a 4 short interval.                              *
*                                                                           *
*       Little ENDIAN Configuration is used and the input and output data   *
*       is 16 bit unsigned and signed shorts respectively.  The filters     *
*       are also 16 bit signed shorts in 12 bit precision.                  *
*                                                                           *
*       The n_hh filters are all of the same length and are                 *
*       strung together in a single linear array.                           *
*                                                                           *
*       Interrupts are masked by the function for most of its duration.     *
*                                                                           *
*   MEMORY NOTE                                                             *
*       Some bank hits will occur in this code for certain scale            *
*       factors and filter lengths.                                         *
*                                                                           *
*       For 4 taps k = 0, for l_hh 8, k = 0.031, for l_hh = 16, k = 0.015.  *
*       Different filter lengths can produce different numbers of bank      *
*       conflicts.  Overall, these bank conflicts have nearly zero effect.  *
*                                                                           *
*       For l_hh=4: k=0, l_hh=8: k=1/32, l_hh=12: k=0, l_hh=16: k=1/64      *
*       For l_hh % 8 == 0, k = 1/(4*l_hh) else k = 0                        *
*                                                                           *
*       'k' is the bank conflict between the store and the guidance table   *
*       load.  Depending on the relative sizes of the filters and           *
*       memory width, this bank conflict is between 0 and 3.1%              *
*       overhead.                                                           *
*                                                                           *
*   TECHNIQUES                                                              *
*       The outputs are computed using interleaved inputs. The patch table  *
*       controls the access of 2 parallel pointers. For example an 8/33     *
*       scale factor will have the following access pattern.                *
*                                                                           *
*                 11111111112222222222333333333344444444445555555555        *
*       012345678901234567890123456789012345678901234567890123456789        *
*                                                                           *
*       0  e xxxxxxxx     <-start point of even output 0                    *
*       1      o xxxxxxxx      <-start point of odd output 4                *
*       2          e xxxxxxxx                                               *
*       3              o xxxxxxxx                                           *
*       4                  e xxxxxxxx                                       *
*       5                      o xxxxxxxx                                   *
*       6                          e xxxxxxxx                               *
*       7                              o xxxxxxxx                           *
*       0                                   e xxxxxxxx  <-next start        *
*       1                                       o xxxxxxxx  <-next start    *
*                                                                           *
*                                                                           *
*       From this diagram the even pointer jumps 4 then another 4 as the    *
*       filters have 8 taps, it then jumps 4 to get to the next set of      *
*       input data. The odd pointer does the same. These jumps are          *
*       interleaved and so are the filter coefficients. The jumps are       *
*       in multiples of bytes as non-scaled non-aligned double word         *
*       accesses are used.  In this case the table will be:                 *
*                                                                           *
*           short patch[] = {0,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,10,10,8,8};    *
*                                                                           *
*       Notice the first 2 entries are the initial starting points for      *
*       the two pointers. To remove a dependency in the code the last 2     *
*       entries are copies of the 2nd two. This makes the table almost      *
*       circular.                                                           *
*                                                                           *
*   NOTES                                                                   *
*       Other scale factors can be achieved with the following              *
*       example tables.                                                     *
*                                                                           *
*   Scale Factor Taps  Table short jump[] =                                 *
*   --------------------------------------------------------------------    *
*       5/6       4    {0, 1, 2, 2, 2, 3, 3, 2, 2, 2, 3, 3, 2, 2}           *
*       4/3       8    {0, 4, 4, 4, -3, -2, 4, 4, -2, -3, 4, 4}             *
*       3/4       12   {0,1,4,4,4,4,-6,-5,4,4,4,4,-5,-6,4,4,4,4,-5,-5,4,4}  *
*       6/5       16   {0,0,4,4,4,4,4,4,-11,-10,4,4,4,4,4,4,-10,-10,        *
*                       4,4,4,4,4,4,-10,-11,4,4}                            *
*                                                                           *
*       The software to produce these tables and the simple coefficients    *
*       for an arbitrary scale factor and number of taps can be found       *
*       in the api document. Note in the case of 3/4, odd scale factors     *
*       are doubled to make 6/8 instead of 3/4                              *
*                                                                           *
*   CYCLES                                                                  *
*       cycles = 0.5 * out_len * l_hh * (1+k) + 30.                         *
*       If (l_hh % 8) == 0 then k = 1/(4*l_hh) else k = 0.                  *
*                                                                           *
*       For l_hh = 16, in_len = 1024, and out_len = 1366,  cycles = 11129.  *
*       For l_hh = 8,  in_len = 640,  and out_len = 120,   cycles = 525.    *
*                                                                           *
*   CODESIZE                                                                *
*       452 bytes                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2001 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *
                .sect ".data:copyright_h"       ; copyright text lives in its own data subsection
_Copyright:     .string "Copyright (C) 2001 Texas Instruments Incorporated. "
                .string "All Rights Reserved."
; NOTE(review): no .text/.sect directive is visible between the copyright
; strings above and the code label that follows the include.  Presumably the
; included header selects the code section -- confirm in scale_horz_h.h62.
                .include "scale_horz_h.h62"
; ---------------------------------------------------------------------------
; _scale_horz_asm
;
; Hand-scheduled, software-pipelined FIR scaler for one line of 16-bit pixels.
; Every pass through the 4-packet kernel consumes 4 taps for an "even" output
; (A side) and 4 taps for an "odd" output (B side) using DOTP2 pairs.  When
; the tap counter reaches zero the two accumulators are shifted right by 6,
; packed into one word and written with a single STW.
;
; Arguments as the code reads them (assuming the standard C6000 argument
; registers A4,B4,A6,B6,A8,B8,A10,B10,A12 -- TODO confirm against caller):
;   A4  plane_x     input line                 B4  n_x    (never referenced)
;   A6  plane_y     output line                B6  n_y    output pixels
;   A8  hh          interleaved filter taps    B8  l_hh   taps per filter
;   A10 n_hh        number of filters          B10 patch  jump table
;   A12 filt_state  word array used for resumable state:
;                     [0] read at entry  -> filter countdown (B_f_cnt)
;                     [1] written at exit <- packed (kb:ka - patch[1]:patch[0]) >> 1
;                     [2] written at exit <- filter countdown
; NOTE(review): the USAGE prototype in the file header lists only eight
; parameters and does not mention filt_state; the countdown is read from
; word [0] but written back to word [2].  Confirm the intended contract.
;
; Side effects visible here: CSR bit 0 (GIE) is cleared for the duration of
; the routine and the caller's CSR is restored in the last delay slot of the
; return branch.  No register in A10-A15 / B10-B15 is written and the stack
; pointer is not referenced.
;
; The bracketed pairs in the trailing comments (e.g. [18,1]) appear to be
; the scheduler's [cycle, iteration] annotations -- TODO confirm.
; ---------------------------------------------------------------------------
_scale_horz_asm: 
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
; Several symbols below share one physical register (A17, A18, A22, A16,
; B16, B17, B18, B19, B22); the code relies on the schedule to keep those
; uses from colliding.
        .asg     A4,         A_plane_x    ; arg: ptr to input line
        .asg     B4,         B_n_x        ; arg: input length (unused below)
        .asg     A6,         A_plane_y    ; arg: ptr to output line
        .asg     B6,         B_n_y        ; arg: output length, sets loop count
        .asg     A8,         A_hh         ; arg: filter coefficient base
        .asg     B8,         B_l_hh       ; arg: taps per filter
        .asg     A10,        A_n_hh       ; arg: number of filters
        .asg     B10,        B_patch      ; arg: jump (patch) table base
        .asg     A12,        A_filt_state ; arg: state word array (see header note)
        .asg     B18,        B_filt_no    ; max_filt - f_cnt: resume offset into hh
        .asg     B16,        B_hh0        ; hh + filt_no halfwords: resume address
        .asg     A23,        A_ptr_hh     ; even-output coefficient read pointer
        .asg     B24,        B_ptr_hh     ; odd-output coefficient read pointer
        .asg     A9,         A_line0_x0   ; input base for even-output loads
        .asg     B9,         B_line0_x1   ; input base for odd-output loads
        .asg     A18,        A_ka         ; even byte offset into the input line
        .asg     B17,        B_kb         ; odd byte offset into the input line
        .asg     A17,        A_jump10     ; packed pair of jumps from patch table
        .asg     A19,        A_kbka       ; packed offsets: kb (high) : ka (low)
        .asg     A22,        A_patch0     ; patch table read pointer
        .asg     A20,        A_y0         ; even-output accumulator
        .asg     B20,        B_y1         ; odd-output accumulator
        .asg     B21,        B_line0_y    ; output write pointer
        .asg     A7,         A_round      ; rounding constant 0x20 (1 << 5)
        .asg     A0,         A_taps       ; taps-remaining counter / predicate
        .asg     A5,         A_l_hh       ; A-side copy of l_hh
        .asg     B0,         B_f_cnt      ; filter countdown / predicate
        .asg     A3,         A_patch      ; A-side copy of patch
        .asg     B7,         B_hh         ; B-side copy of hh
        .asg     B5,         B_max_filt   ; l_hh * n_hh
        .asg     B22,        B_i          ; kernel loop counter for BDEC
        .asg     A27,        A_h03h02     ; even coefficients 3,2
        .asg     A26,        A_h01h00     ; even coefficients 1,0
        .asg     B27,        B_h13h12     ; odd coefficients 3,2
        .asg     B26,        B_h11h10     ; odd coefficients 1,0
        .asg     A17,        A_x03x02     ; even inputs 3,2 (shares A17 with A_jump10)
        .asg     A16,        A_x01x00     ; even inputs 1,0
        .asg     B17,        B_x13x12     ; odd inputs 3,2 (shares B17 with B_kb)
        .asg     B16,        B_x11x10     ; odd inputs 1,0 (shares B16 with B_hh0)
        .asg     B19,        B_one        ; constant 1 for MPYLH
        .asg     A24,        A_p00        ; even partial dot product, taps 1,0
        .asg     A21,        A_p01        ; even partial dot product, taps 3,2
        .asg     A18,        A_p0         ; even 4-tap sum (shares A18 with A_ka)
        .asg     B23,        B_p10        ; odd partial dot product, taps 1,0
        .asg     B28,        B_p11        ; odd partial dot product, taps 3,2
        .asg     B18,        B_p1         ; odd 4-tap sum (shares B18 with B_filt_no)
        .asg     A16,        A_t_y0       ; y0 >> 6 (shares A16 with A_x01x00)
        .asg     B19,        B_t_y1       ; y1 >> 6 (shares B19 with B_one)
        .asg     B16,        B_t_y10      ; packed output word t_y1:t_y0
        .asg     A22,        A_k1k0       ; patch[1]:patch[0] reloaded at exit
        .asg     A28,        A_kbka_      ; offset state written to filt_state[1]
        .asg     B29,        B_csr        ; caller's CSR, restored on return
        .asg     B22,        B_csr_no_gie ; CSR with bit 0 cleared (shares B22 with B_i)
* =========================== PIPE LOOP PROLOG ============================ *
; Fetch the saved filter countdown and capture CSR so interrupts can be
; masked while the pipelined loop is in flight.
        LDW   .D1T2 *A_filt_state[0],      B_f_cnt      ;[ 2,0] 
||      MVC     .S2 CSR,        B_csr                   ;

; Clear the partial-product registers so the first kernel passes, which run
; their add stage before any DOTP2 result exists, add zero.
        ZERO    .L1 A_p01                               ;
||      AND     .S2 B_csr,      -2,        B_csr_no_gie ;

        ZERO    .L2 B_p10                               ;
||      MVC     .S2 B_csr_no_gie,          CSR          ;

        MPY    .M2X B_l_hh,     A_n_hh,    B_max_filt   ;[5,0]max_flt=l_hh*n_hh

        ZERO    .L2 B_p11                               ;
||      ZERO    .L1 A_p00                               ;

; filt_no = max_filt - f_cnt is how far into the filter bank the previous
; call stopped (in coefficients).
        MV     .L2X A_hh,       B_hh                    ;[ 7,0] 
||      SUB     .D2 B_max_filt, B_f_cnt, B_filt_no      ;[ 7,0]filt_cnt=max_flt
||      MV     .D1X B_patch,    A_patch                 ;[ 7,0]patch0=patch

; hh0 = hh + filt_no halfwords.  The first patch word holds the two initial
; offsets (kb in the high half, ka in the low half).  filt_no is halved to
; become the matching byte offset into the patch table.
        ADDAH   .D2 B_hh,       B_filt_no, B_hh0        ;[ 8,0] 
||      LDW   .D1T1 *A_patch[0],           A_kbka       ;[ 8,0]initial start
||      SHRU    .S2 B_filt_no,  1,         B_filt_no    ;[ 8,0] 

        ADD     .L1 A_patch,    4,         A_patch0     ;[ 9,0]patch0=patch+4 bytes
||      ZERO    .S1 A_h01h00                            ;

; The next five branches enter the kernel part-way through each execute
; packet (instructions are 4 bytes, so "+20" skips the first five slots and
; "+12" the first three).  This lets the kernel's own load/address stages
; prime the pipeline without a separate copy of those instructions.
        ADD    .D1X A_patch0,   B_filt_no, A_patch0     ;[10,0] 
||      ZERO    .L2 B_h13h12                            ;
||      B       .S1 LOOPY + 20                          ;

        LDW   .D1T1 *A_patch0++[1],        A_jump10     ;[11,0] first offset
||      B       .S1 LOOPY1 + 12                         ;

; B_i starts as l_hh * n_y; MPY by 1 is used as a 16-bit register copy.
        MPY     .M2 B_l_hh,     B_n_y,     B_i          ;[12,0] 
||      MV     .L2X A_plane_y,  B_line0_y               ;[12,0]line0_y=plane_y
||      MVK     .S1 020h,       A_round                 ;[12,0] 
||      MVK     .D2 1,          B_one                   ;[12,0] 
||      MPY    .M1X 1,          B_l_hh,    A_l_hh       ;[12,0] 
||      B       .S2 LOOPY2 + 12                         ;

; Unpack the initial offsets: MPYLH takes the high half of A_kbka (kb),
; MPY takes the low half (ka).
        MPYLH  .M2X B_one,      A_kbka,    B_kb         ;[13,0] 
||      MPY     .M1 1,          A_kbka,    A_ka         ;[13,0] 
||      B       .S1 LOOPY3 + 20                         ;

; Each kernel pass handles 8 multiplies (4 even + 4 odd), hence the >> 3.
; NOTE(review): taps is seeded with l_hh + 12, presumably to absorb the
; three kernel passes that decrement it before valid products arrive --
; confirm against the pipeline depth.
        SHRU    .S2 B_i,        3,         B_i          ;[14,0] 
||      MPY    .M2X 1,          A_round,   B_y1         ;[14,0] y1 = 1 << 5
||      ADD     .L1 12,         A_l_hh,    A_taps       ;[14,0]taps=l_hh+12
||      ROTL    .M1 A_plane_x,  0,         A_line0_x0   ;[15,0]line0_x=plane_x
||      B       .S1 LOOPY                               ;

; The odd coefficient pointer runs 8 bytes (4 shorts) ahead of the even one
; because the filters are interleaved in groups of four taps.
; NOTE(review): the "- 3" adjustment (original comment: "2 + 1") presumably
; accounts for passes executed by the epilog and BDEC's test -- confirm.
        SUB     .S2 B_i,        3,         B_i          ;[15,0]2 + 1
||      ADD     .D2 B_hh0,      8,         B_ptr_hh     ;[15,0]ptr_hh=hh0+8 bytes
||      MV     .L1X B_hh0,      A_ptr_hh                ;[15,0]ptr_hh = hh0
||      MV     .L2X A_plane_x,  B_line0_x1              ;[15,0]line1_x=plane_x
||      MV      .S1 A_round,    A_y0                    ;[15,0]y0 = 1 << 5
||      LDW   .D1T1 *A_patch0++[1],        A_jump10     ;[1,1]load next offst
* =========================== PIPE LOOP KERNEL ============================ *
; Four execute packets per pass (LOOPY, LOOPY1, LOOPY2, LOOPY3).  The extra
; labels exist only as targets for the offset branches in the prolog.
;
; Packet 1: shift the accumulators for a possible store, count down taps,
; start two of the four DOTP2s, fetch the next 4+4 coefficients and count
; down the filter bank.
LOOPY:
        SHR     .S2 B_y1,       6,         B_t_y1       ;[18,1] 
||      SHR     .S1 A_y0,       6,         A_t_y0       ;[18,1] 
||      SUB     .L1 A_taps,     4,         A_taps       ;[18,1] 
||      DOTP2   .M1 A_h01h00,   A_x01x00,  A_p00        ;[10,3] 
||      DOTP2   .M2 B_h13h12,   B_x13x12,  B_p11        ;[10,3] 
||      LDDW  .D1T1 *A_ptr_hh++[2], A_h03h02:A_h01h00   ;[ 2,5]h3:0=*ptr_hh++
||      LDDW  .D2T2 *B_ptr_hh++[2], B_h13h12:B_h11h10   ;[ 2,5]h3:0=*ptr_hh++
||      SUB     .L2 B_f_cnt,    8,         B_f_cnt      ;[ 2,5]fil_count-=8
; Packet 2: when a filter has been fully applied (taps == 0) reload both
; accumulators with the rounding constant; the MPY result lands after one
; delay slot, in time for the accumulate in packet 4.  When the filter bank
; is exhausted (f_cnt == 0) rewind the coefficient and patch pointers.
; ADD2 advances ka and kb together as packed halfwords.
LOOPY1:
  [!A_taps]MPY  .M1 1,          A_round,   A_y0         ;[19,1]if(!taps)y0=round
||[!A_taps]MPY .M2X 1,          A_round,   B_y1         ;[19,1]if(!taps)y1=round
||      ADD     .S2 B_p10,      B_p11,     B_p1         ;[15,2] 
||[!B_f_cnt]MV .S1X B_hh,       A_ptr_hh                ;[3,5](!flt_c)p_hh=hh
||      ADD2    .D1 A_jump10,   A_kbka,    A_kbka       ;[3,5] 
||[!B_f_cnt]ADD .L2 B_hh,       8,         B_ptr_hh     ;[3,5](!flt_c)p_hh=hh+4
||[!B_f_cnt]ADD .L1 A_patch,    8,         A_patch0     ;[3,5](!flt_c)ptch0=ptch
||      LDNDW .D2T2 *B_line0_x1(B_kb), B_x13x12:B_x11x10;[3,5]x3:0=*(line0_x+kb)
; Packet 3: pack the two shifted results (odd in the high half, even in the
; low half), loop control, and the even-side input load.  The parenthesised
; offset form means ka/kb are byte offsets, not scaled indices.  B_one is
; rewritten every pass because it shares B19 with B_t_y1.
LOOPY2:
        PACK2  .L2X B_t_y1,     A_t_y0,    B_t_y10      ;[20,1] 
||      ADD     .S1 A_p00,      A_p01,     A_p0         ;[16,2] 
||      DOTP2   .M2 B_h11h10,   B_x11x10,  B_p10        ;[ 8,4] 
||      BDEC    .S2 LOOPY,      B_i                     ;[16,2] 
||      LDNDW .D1T1 *A_line0_x0(A_ka), A_x03x02:A_x01x00;[4,5]x3:0=*(line0_x+ka)
||      MPY     .M1 1,          A_kbka,    A_ka         ;[ 4,5] 
||      MVK     .D2 1,          B_one                   ;[ 4,5] 
; Packet 4: on a completed filter store the packed output pair and reload
; the tap counter; otherwise just accumulate.  Also reset the filter
; countdown on wrap and fetch the next pair of jumps.
LOOPY3:
  [!A_taps]MV   .L1 A_l_hh,     A_taps                  ;[21,1]taps=l_hh
||[!A_taps]STW.D2T2 B_t_y10,    *B_line0_y++[1]         ;[21,1]if(!samp)
||      ADD     .S1 A_p0,       A_y0,      A_y0         ;[17,2] *line0_y++=t_y0
||      ADD     .S2 B_p1,       B_y1,      B_y1         ;[17,2] 
||      DOTP2   .M1 A_h03h02,   A_x03x02,  A_p01        ;[9,4] 
||[!B_f_cnt]MV  .L2 B_max_filt, B_f_cnt                 ;[5,5]if(!flt_c)
||      MPYLH  .M2X B_one,      A_kbka,    B_kb         ;[5,5]  flt_cnt=max_flt
||      LDW   .D1T1 *A_patch0++[1],        A_jump10     ;[1,6]load next offst
* =========================== PIPE LOOP EPILOG ============================ *
; Drain: the same compute/store stages as the kernel, without new loads of
; coefficients or pixels.  patch[1]:patch[0] is reloaded here so the exit
; code can express the final offsets relative to the initial ones.
        SHR     .S2 B_y1,       6,         B_t_y1       ;[18,3] 
||      SHR     .S1 A_y0,       6,         A_t_y0       ;[18,3] 
||      SUB     .L1 A_taps,     4,         A_taps       ;[18,3] 
||      DOTP2   .M1 A_h01h00,   A_x01x00,  A_p00        ;[10,5] 
||      DOTP2   .M2 B_h13h12,   B_x13x12,  B_p11        ;[10,5] 
||      LDW   .D1T1 *A_patch[0],A_k1k0                  ;

  [!A_taps]MPY  .M1 1,          A_round,   A_y0         ;[19,3]if(!taps)y0=round
||[!A_taps]MPY .M2X 1,          A_round,   B_y1         ;[19,3]if(!taps)y1=round
||      ADD     .S2 B_p10,      B_p11,     B_p1         ;[15,4] 

; B_i is zeroed so that the first BDEC in PIPE_DOWN is taken and later ones
; are not (BDEC branches while the counter is non-negative, then decrements).
; NOTE(review): the explicit branch targets the block that also follows by
; fall-through; the number of PIPE_DOWN passes therefore depends on branch
; delay-slot timing -- verify before changing anything in this region.
        PACK2  .L2X B_t_y1,     A_t_y0,    B_t_y10      ;[20,3] 
||      ADD     .L1 A_p00,      A_p01,     A_p0         ;[16,4] 
||      DOTP2   .M2 B_h11h10,   B_x11x10,  B_p10        ;[ 8,6] 
||      B       .S1 PIPE_DOWN                           ;
||      MVK     .S2 0,          B_i                     ;

  [!A_taps]MV   .L1 A_l_hh,     A_taps                  ;[21,3]taps = l_hh
||[!A_taps]STW.D2T2 B_t_y10,    *B_line0_y++[1]         ;[21,3]if(!sample)
||      ADD     .S1 A_p0,       A_y0,      A_y0         ;[17,4]  *line0_y++=t_y0
||      ADD     .S2 B_p1,       B_y1,      B_y1         ;[17,4] 
||      DOTP2   .M1 A_h03h02,   A_x03x02,  A_p01        ;[ 9,6] 
; Final drain block; also derives the offset state to hand back:
; A_kbka_ = (kb:ka - patch[1]:patch[0]) >> 1, computed per halfword.
PIPE_DOWN:
        SHR     .S2 B_y1,       6,         B_t_y1       ;[18,4] 
||      SHR     .S1 A_y0,       6,         A_t_y0       ;[18,4] 
||      SUB     .L1 A_taps,     4,         A_taps       ;[18,4] 
||      DOTP2   .M1 A_h01h00,   A_x01x00,  A_p00        ;[10,6] 
||      DOTP2   .M2 B_h13h12,   B_x13x12,  B_p11        ;[10,6] 

  [!A_taps]MPY  .M1 1,          A_round,   A_y0         ;[19,4]if(!taps)y0=round
||[!A_taps]MPY .M2X 1,          A_round,   B_y1         ;[19,4]if(!taps)y1=round
||      ADD     .S2 B_p10,      B_p11,     B_p1         ;[15,5] 
||      SUB2    .L1 A_kbka,     A_k1k0,    A_kbka_      ;

        PACK2  .L2X B_t_y1,     A_t_y0,    B_t_y10      ;[20,4] 
||      ADD     .L1 A_p00,      A_p01,     A_p0         ;[16,5] 
||      BDEC    .S2 PIPE_DOWN,  B_i                     ;
||      SHR2    .S1 A_kbka_,    1,         A_kbka_      ;

  [!A_taps]MV   .L1 A_l_hh,     A_taps                  ;[21,4]taps=l_hh
||      ADD     .S1 A_p0,       A_y0,      A_y0         ;[17,5] 
||      ADD     .S2 B_p1,       B_y1,      B_y1         ;[17,5] 
||[!A_taps]STW.D2T2 B_t_y10,    *B_line0_y++[1]         ;[21,4](!samp)
                                                        ;       *line0_y++=t_y0
* ========================================================================= *
; Return: BNOP supplies two of the five branch delay slots as NOPs; the two
; state stores and the CSR restore fill the remaining three, so interrupts
; are re-enabled (if they were enabled on entry) just as control returns.
        BNOP    .S2 B3,         2                       ;return to call

        STW   .D1T1 A_kbka_,    *A_filt_state[1]        ; save offset state

        STW   .D1T2 B_f_cnt,    *A_filt_state[2]        ; save filter countdown

        MVC     .S2 B_csr,      CSR                     ; restore caller's CSR
        ;BRANCH OCCURS 
* ========================================================================= *
*   End of file:  scale_horz_h.asm                                          *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2001 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *

